pip install xgboost
Collecting xgboost
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
-------------------------------------- 124.9/124.9 MB 7.0 MB/s eta 0:00:00
Requirement already satisfied: scipy in c:\users\dell\anaconda3\lib\site-packages (from xgboost) (1.10.0)
Requirement already satisfied: numpy in c:\users\dell\anaconda3\lib\site-packages (from xgboost) (1.23.5)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.1
Note: you may need to restart the kernel to use updated packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, ShuffleSplit, learning_curve, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
import pandas as pd
import re
import seaborn as sns
from scipy import stats
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import RandomizedSearchCV
df = pd.read_csv("D:/Swinburne/Sem 3/Big Data/HD/apm_point_sold_nsw.csv")
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1140979 entries, 0 to 1140978 Data columns (total 65 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 FID 1140979 non-null object 1 propertycategorisation 1140979 non-null object 2 streetname 1140885 non-null object 3 streettype 1129977 non-null object 4 suburb 1140979 non-null object 5 postcode 1140979 non-null int64 6 state 1140979 non-null object 7 sa1code 1019459 non-null float64 8 eventdate 1140979 non-null object 9 eventprice 1111823 non-null float64 10 eventtypecode 1140979 non-null object 11 firstadvertisedeventdate 885428 non-null object 12 firstadvertisedeventprice 542927 non-null float64 13 lastadvertisedeventdate 885428 non-null object 14 lastadvertisedeventprice 625317 non-null float64 15 finalresulteventdate 1140944 non-null object 16 finalresulteventprice 1128291 non-null float64 17 finalresulteventtypecode 1140907 non-null object 18 propertytype 1140979 non-null object 19 areasize 1108638 non-null float64 20 bedrooms 927430 non-null float64 21 baths 888670 non-null float64 22 parking 887961 non-null float64 23 hasstudy 191923 non-null object 24 hasseparatedining 62496 non-null object 25 hasfamilyroom 13067 non-null object 26 hassunroom 32661 non-null object 27 hasbilliardroom 0 non-null float64 28 hasrumpusroom 65591 non-null object 29 hasfireplace 71231 non-null object 30 haswalkinwardrobe 113388 non-null object 31 hascourtyard 94148 non-null object 32 hasinternallaundry 183089 non-null object 33 hasheating 122832 non-null object 34 hassauna 0 non-null float64 35 hasairconditioning 459991 non-null object 36 hasbalcony 229454 non-null object 37 hasbarbeque 56354 non-null object 38 haspolishedtimberfloor 47489 non-null object 39 hasensuite 336924 non-null object 40 hasspa 28979 non-null object 41 hasgarage 100683 non-null object 42 haslockupgarage 122256 non-null object 43 haspool 47729 non-null object 44 hastenniscourt 4183 non-null object 45 hasbeenrenovated 0 non-null float64 46 hasalarm 68698 non-null object 47 haswaterview 0 non-null float64 48 hasharbourview 0 non-null float64 49 hasoceanview 0 non-null float64 50 hascityview 0 non-null float64 51 hasbushview 0 non-null float64 52 hasdistrictview 0 non-null float64 53 hasbayview 0 non-null float64 54 hasparkview 0 non-null float64 55 hasriverview 0 non-null float64 56 hasmountainview 0 non-null float64 57 advertlinkexternalid 961984 non-null float64 58 property_latitude 1111457 non-null float64 59 property_longitude 1111457 non-null float64 60 street_centroid_latitude 1133509 non-null float64 61 street_centroid_longitude 1133509 non-null float64 62 month 1140979 non-null int64 63 year 1140979 non-null int64 64 the_geom 1111457 non-null object dtypes: float64(27), int64(3), object(35) memory usage: 565.8+ MB
# Display the names of all columns
print("Column Names:")
print(df.columns)
# Display data types of each column
print("\nData Types of Columns:")
print(df.dtypes)
# Display a summary of non-null counts and data types
print("\nInfo Summary:")
print(df.info())
# Display the first few rows to get a sense of the data in each column
print("\nFirst Few Rows of the Dataset:")
print(df.head())
# Display the unique values and their counts for categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
print(f"\nUnique Values in '{col}':")
print(df[col].value_counts())
# Display basic statistics for numerical columns
print("\nDescriptive Statistics for Numerical Columns:")
print(df.describe())
Column Names:
Index(['FID', 'propertycategorisation', 'streetname', 'streettype', 'suburb',
'postcode', 'state', 'sa1code', 'eventdate', 'eventprice',
'eventtypecode', 'firstadvertisedeventdate',
'firstadvertisedeventprice', 'lastadvertisedeventdate',
'lastadvertisedeventprice', 'finalresulteventdate',
'finalresulteventprice', 'finalresulteventtypecode', 'propertytype',
'areasize', 'bedrooms', 'baths', 'parking', 'hasstudy',
'hasseparatedining', 'hasfamilyroom', 'hassunroom', 'hasbilliardroom',
'hasrumpusroom', 'hasfireplace', 'haswalkinwardrobe', 'hascourtyard',
'hasinternallaundry', 'hasheating', 'hassauna', 'hasairconditioning',
'hasbalcony', 'hasbarbeque', 'haspolishedtimberfloor', 'hasensuite',
'hasspa', 'hasgarage', 'haslockupgarage', 'haspool', 'hastenniscourt',
'hasbeenrenovated', 'hasalarm', 'haswaterview', 'hasharbourview',
'hasoceanview', 'hascityview', 'hasbushview', 'hasdistrictview',
'hasbayview', 'hasparkview', 'hasriverview', 'hasmountainview',
'advertlinkexternalid', 'property_latitude', 'property_longitude',
'street_centroid_latitude', 'street_centroid_longitude', 'month',
'year', 'the_geom'],
dtype='object')
Data Types of Columns:
FID object
propertycategorisation object
streetname object
streettype object
suburb object
...
street_centroid_latitude float64
street_centroid_longitude float64
month int64
year int64
the_geom object
Length: 65, dtype: object
Info Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1140979 entries, 0 to 1140978
Data columns (total 65 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 FID 1140979 non-null object
1 propertycategorisation 1140979 non-null object
2 streetname 1140885 non-null object
3 streettype 1129977 non-null object
4 suburb 1140979 non-null object
5 postcode 1140979 non-null int64
6 state 1140979 non-null object
7 sa1code 1019459 non-null float64
8 eventdate 1140979 non-null object
9 eventprice 1111823 non-null float64
10 eventtypecode 1140979 non-null object
11 firstadvertisedeventdate 885428 non-null object
12 firstadvertisedeventprice 542927 non-null float64
13 lastadvertisedeventdate 885428 non-null object
14 lastadvertisedeventprice 625317 non-null float64
15 finalresulteventdate 1140944 non-null object
16 finalresulteventprice 1128291 non-null float64
17 finalresulteventtypecode 1140907 non-null object
18 propertytype 1140979 non-null object
19 areasize 1108638 non-null float64
20 bedrooms 927430 non-null float64
21 baths 888670 non-null float64
22 parking 887961 non-null float64
23 hasstudy 191923 non-null object
24 hasseparatedining 62496 non-null object
25 hasfamilyroom 13067 non-null object
26 hassunroom 32661 non-null object
27 hasbilliardroom 0 non-null float64
28 hasrumpusroom 65591 non-null object
29 hasfireplace 71231 non-null object
30 haswalkinwardrobe 113388 non-null object
31 hascourtyard 94148 non-null object
32 hasinternallaundry 183089 non-null object
33 hasheating 122832 non-null object
34 hassauna 0 non-null float64
35 hasairconditioning 459991 non-null object
36 hasbalcony 229454 non-null object
37 hasbarbeque 56354 non-null object
38 haspolishedtimberfloor 47489 non-null object
39 hasensuite 336924 non-null object
40 hasspa 28979 non-null object
41 hasgarage 100683 non-null object
42 haslockupgarage 122256 non-null object
43 haspool 47729 non-null object
44 hastenniscourt 4183 non-null object
45 hasbeenrenovated 0 non-null float64
46 hasalarm 68698 non-null object
47 haswaterview 0 non-null float64
48 hasharbourview 0 non-null float64
49 hasoceanview 0 non-null float64
50 hascityview 0 non-null float64
51 hasbushview 0 non-null float64
52 hasdistrictview 0 non-null float64
53 hasbayview 0 non-null float64
54 hasparkview 0 non-null float64
55 hasriverview 0 non-null float64
56 hasmountainview 0 non-null float64
57 advertlinkexternalid 961984 non-null float64
58 property_latitude 1111457 non-null float64
59 property_longitude 1111457 non-null float64
60 street_centroid_latitude 1133509 non-null float64
61 street_centroid_longitude 1133509 non-null float64
62 month 1140979 non-null int64
63 year 1140979 non-null int64
64 the_geom 1111457 non-null object
dtypes: float64(27), int64(3), object(35)
memory usage: 565.8+ MB
None
First Few Rows of the Dataset:
FID propertycategorisation streetname streettype \
0 apm_point_sold_nsw.374101 House Federation Dr
1 apm_point_sold_nsw.374102 House Koel Pl
2 apm_point_sold_nsw.374103 House Stanley St
3 apm_point_sold_nsw.374104 House Peter Cr
4 apm_point_sold_nsw.374105 House Babbin Pl
suburb postcode state sa1code eventdate eventprice \
0 Medowie 2318 NSW 1.060311e+10 2021-05-28 950000.0
1 Woronora Heights 2233 NSW 1.280216e+10 2019-05-08 1050100.0
2 Mona Vale 2103 NSW 1.220214e+10 2020-08-21 3000000.0
3 Greenacre 2190 NSW 1.190114e+10 2022-05-07 1330000.0
4 Caringbah South 2229 NSW 1.280116e+10 2023-10-27 2245000.0
... hasriverview hasmountainview advertlinkexternalid property_latitude \
0 ... NaN NaN 2.016998e+09 -32.73393
1 ... NaN NaN NaN -34.03078
2 ... NaN NaN 2.016396e+09 -33.67632
3 ... NaN NaN 2.017728e+09 -33.89731
4 ... NaN NaN 2.018629e+09 -34.04770
property_longitude street_centroid_latitude street_centroid_longitude \
0 151.87674 -32.73197 151.87641
1 151.02859 -34.03108 151.02864
2 151.31369 -33.67584 151.31403
3 151.05179 -33.89696 151.05268
4 151.11682 -34.04781 151.11606
month year the_geom
0 5 2021 POINT (-32.73393 151.87674)
1 5 2019 POINT (-34.03078 151.02859)
2 8 2020 POINT (-33.67632 151.31369)
3 5 2022 POINT (-33.89731 151.05179)
4 10 2023 POINT (-34.0477 151.11682)
[5 rows x 65 columns]
Unique Values in 'FID':
apm_point_sold_nsw.374101 1
apm_point_sold_nsw.1138568 1
apm_point_sold_nsw.1139843 1
apm_point_sold_nsw.1138573 1
apm_point_sold_nsw.1138572 1
..
apm_point_sold_nsw.756321 1
apm_point_sold_nsw.756322 1
apm_point_sold_nsw.756323 1
apm_point_sold_nsw.756324 1
apm_point_sold_nsw.374100 1
Name: FID, Length: 1140979, dtype: int64
Unique Values in 'propertycategorisation':
House 781518
Unit 359461
Name: propertycategorisation, dtype: int64
Unique Values in 'streetname':
Pacific 7785
Victoria 5669
George 4332
Park 4127
Railway 3638
...
Sol 1
Greatrex 1
Fels 1
Nandillion Ponds 1
Haverty 1
Name: streetname, Length: 28046, dtype: int64
Unique Values in 'streettype':
St 458317
Rd 205858
Av 143189
Dr 67916
Pl 50583
...
Pur 1
Fitr 1
Ra 1
La S 1
Brae 1
Name: streettype, Length: 186, dtype: int64
Unique Values in 'suburb':
Port Macquarie 8566
Blacktown 7424
Schofields 6392
Orange 6062
Ryde 5818
...
Mulloon 1
Auburn Vale 1
Burrier 1
Booyong 1
Wombeyan Caves 1
Name: suburb, Length: 2541, dtype: int64
Unique Values in 'state':
NSW 1140979
Name: state, dtype: int64
Unique Values in 'eventdate':
2021-03-27 1713
2019-12-20 1641
2021-03-31 1500
2020-12-18 1409
2021-04-01 1402
...
2022-01-02 12
2019-12-25 11
2023-12-26 11
2021-12-26 9
2022-12-25 8
Name: eventdate, Length: 1947, dtype: int64
Unique Values in 'eventtypecode':
PTSD 943755
AUSD 70111
AUSP 62382
PTSW 43365
AUSA 7605
AUSN 6427
AUPN 6169
AUSS 1165
Name: eventtypecode, dtype: int64
Unique Values in 'firstadvertisedeventdate':
2020-03-08 1956
2019-04-16 1827
2019-05-07 1786
2019-02-19 1764
2020-03-26 1694
...
2013-10-18 1
2015-01-21 1
2013-09-04 1
2015-06-24 1
2015-03-19 1
Name: firstadvertisedeventdate, Length: 2988, dtype: int64
Unique Values in 'lastadvertisedeventdate':
2020-04-15 36862
2019-02-20 4499
2020-09-09 4389
2019-03-20 2188
2019-03-13 2004
...
2018-12-30 1
2018-09-18 1
2018-10-05 1
2018-10-18 1
2018-10-04 1
Name: lastadvertisedeventdate, Length: 2078, dtype: int64
Unique Values in 'finalresulteventdate':
2021-06-21 4552
2019-11-13 2307
2019-12-20 1605
2021-04-01 1515
2021-03-31 1484
...
2018-10-08 1
2018-07-13 1
2018-04-09 1
2017-01-27 1
2018-11-03 1
Name: finalresulteventdate, Length: 2147, dtype: int64
Unique Values in 'finalresulteventtypecode':
PTSD 1020208
PTSW 47484
AUSD 47470
PTLA 12988
AUSP 8574
AUSA 2734
AUPN 817
AUSN 531
AUSS 101
Name: finalresulteventtypecode, dtype: int64
Unique Values in 'propertytype':
House 687691
Unit 348071
Townhouse 77207
Duplex 8657
Semi 7991
Villa 4493
Studio 2688
Cottage 2112
Terrace 2024
Flat 43
Triplex 2
Name: propertytype, dtype: int64
Unique Values in 'hasstudy':
True 191923
Name: hasstudy, dtype: int64
Unique Values in 'hasseparatedining':
True 62496
Name: hasseparatedining, dtype: int64
Unique Values in 'hasfamilyroom':
True 13067
Name: hasfamilyroom, dtype: int64
Unique Values in 'hassunroom':
True 32661
Name: hassunroom, dtype: int64
Unique Values in 'hasrumpusroom':
True 65591
Name: hasrumpusroom, dtype: int64
Unique Values in 'hasfireplace':
True 71231
Name: hasfireplace, dtype: int64
Unique Values in 'haswalkinwardrobe':
True 113388
Name: haswalkinwardrobe, dtype: int64
Unique Values in 'hascourtyard':
True 94148
Name: hascourtyard, dtype: int64
Unique Values in 'hasinternallaundry':
True 183089
Name: hasinternallaundry, dtype: int64
Unique Values in 'hasheating':
True 122832
Name: hasheating, dtype: int64
Unique Values in 'hasairconditioning':
True 459991
Name: hasairconditioning, dtype: int64
Unique Values in 'hasbalcony':
True 229454
Name: hasbalcony, dtype: int64
Unique Values in 'hasbarbeque':
True 56354
Name: hasbarbeque, dtype: int64
Unique Values in 'haspolishedtimberfloor':
True 47489
Name: haspolishedtimberfloor, dtype: int64
Unique Values in 'hasensuite':
True 336924
Name: hasensuite, dtype: int64
Unique Values in 'hasspa':
True 28979
Name: hasspa, dtype: int64
Unique Values in 'hasgarage':
True 100683
Name: hasgarage, dtype: int64
Unique Values in 'haslockupgarage':
True 122256
Name: haslockupgarage, dtype: int64
Unique Values in 'haspool':
True 47729
Name: haspool, dtype: int64
Unique Values in 'hastenniscourt':
True 4183
Name: hastenniscourt, dtype: int64
Unique Values in 'hasalarm':
True 68698
Name: hasalarm, dtype: int64
Unique Values in 'the_geom':
POINT (-33.82402 151.19541) 558
POINT (-33.85922 151.06267) 557
POINT (-33.9158 151.03215) 436
POINT (-33.77875 151.12151) 405
POINT (-33.69337 150.92687) 388
...
POINT (-34.01225 150.73758) 1
POINT (-35.84237 150.17492) 1
POINT (-28.84233 153.04367) 1
POINT (-32.83757 151.362) 1
POINT (-33.87655 151.16296) 1
Name: the_geom, Length: 515056, dtype: int64
Descriptive Statistics for Numerical Columns:
postcode sa1code eventprice firstadvertisedeventprice \
count 1.140979e+06 1.019459e+06 1.111823e+06 5.429270e+05
mean 2.321050e+03 1.157394e+10 1.131238e+06 9.104721e+05
std 2.412932e+02 7.880421e+08 2.178570e+06 5.274477e+06
min 2.000000e+03 1.010210e+10 0.000000e+00 1.000000e+01
25% 2.130000e+03 1.090312e+10 5.600000e+05 5.236880e+05
50% 2.251000e+03 1.170313e+10 7.911000e+05 7.000000e+05
75% 2.527000e+03 1.220214e+10 1.240000e+06 9.500000e+05
max 4.383000e+03 1.280216e+10 1.012500e+09 2.147484e+09
lastadvertisedeventprice finalresulteventprice areasize \
count 6.253170e+05 1.128291e+06 1.108638e+06
mean 8.995034e+05 1.127100e+06 5.609188e+03
std 3.695883e+06 1.823696e+06 1.495246e+06
min 1.000000e+01 0.000000e+00 1.000000e+00
25% 5.290000e+05 5.590000e+05 4.430000e+02
50% 7.100000e+05 7.900000e+05 6.640000e+02
75% 9.600000e+05 1.245000e+06 1.041000e+03
max 1.900000e+09 1.723654e+08 1.304260e+09
bedrooms baths parking ... hasparkview \
count 927430.000000 888670.000000 887961.000000 ... 0.0
mean 3.175801 1.757097 1.733511 ... NaN
std 1.133642 0.771326 1.093473 ... NaN
min 1.000000 1.000000 0.000000 ... NaN
25% 2.000000 1.000000 1.000000 ... NaN
50% 3.000000 2.000000 2.000000 ... NaN
75% 4.000000 2.000000 2.000000 ... NaN
max 32.000000 20.000000 70.000000 ... NaN
hasriverview hasmountainview advertlinkexternalid property_latitude \
count 0.0 0.0 9.619840e+05 1.111457e+06
mean NaN NaN 1.904397e+09 -3.348589e+01
std NaN NaN 4.607707e+08 1.338444e+00
min NaN NaN 8.312490e+05 -3.725666e+01
25% NaN NaN 2.015695e+09 -3.393300e+01
50% NaN NaN 2.016988e+09 -3.381163e+01
75% NaN NaN 2.018063e+09 -3.343032e+01
max NaN NaN 2.019327e+09 -2.816608e+01
property_longitude street_centroid_latitude \
count 1.111457e+06 1.133509e+06
mean 1.509859e+02 -3.349004e+01
std 1.216910e+00 1.331924e+00
min 1.412155e+02 -3.725511e+01
25% 1.508719e+02 -3.393406e+01
50% 1.510985e+02 -3.381207e+01
75% 1.512698e+02 -3.343280e+01
max 1.536366e+02 -2.816663e+01
street_centroid_longitude month year
count 1.133509e+06 1.140979e+06 1.140979e+06
mean 1.509864e+02 6.562094e+00 2.021192e+03
std 1.209980e+00 3.400298e+00 1.490047e+00
min 1.411078e+02 1.000000e+00 2.019000e+03
25% 1.508692e+02 3.000000e+00 2.020000e+03
50% 1.510980e+02 7.000000e+00 2.021000e+03
75% 1.512680e+02 1.000000e+01 2.022000e+03
max 1.536349e+02 1.200000e+01 2.024000e+03
[8 rows x 30 columns]
df.head()
| FID | propertycategorisation | streetname | streettype | suburb | postcode | state | sa1code | eventdate | eventprice | ... | hasriverview | hasmountainview | advertlinkexternalid | property_latitude | property_longitude | street_centroid_latitude | street_centroid_longitude | month | year | the_geom | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | apm_point_sold_nsw.374101 | House | Federation | Dr | Medowie | 2318 | NSW | 1.060311e+10 | 2021-05-28 | 950000.0 | ... | NaN | NaN | 2.016998e+09 | -32.73393 | 151.87674 | -32.73197 | 151.87641 | 5 | 2021 | POINT (-32.73393 151.87674) |
| 1 | apm_point_sold_nsw.374102 | House | Koel | Pl | Woronora Heights | 2233 | NSW | 1.280216e+10 | 2019-05-08 | 1050100.0 | ... | NaN | NaN | NaN | -34.03078 | 151.02859 | -34.03108 | 151.02864 | 5 | 2019 | POINT (-34.03078 151.02859) |
| 2 | apm_point_sold_nsw.374103 | House | Stanley | St | Mona Vale | 2103 | NSW | 1.220214e+10 | 2020-08-21 | 3000000.0 | ... | NaN | NaN | 2.016396e+09 | -33.67632 | 151.31369 | -33.67584 | 151.31403 | 8 | 2020 | POINT (-33.67632 151.31369) |
| 3 | apm_point_sold_nsw.374104 | House | Peter | Cr | Greenacre | 2190 | NSW | 1.190114e+10 | 2022-05-07 | 1330000.0 | ... | NaN | NaN | 2.017728e+09 | -33.89731 | 151.05179 | -33.89696 | 151.05268 | 5 | 2022 | POINT (-33.89731 151.05179) |
| 4 | apm_point_sold_nsw.374105 | House | Babbin | Pl | Caringbah South | 2229 | NSW | 1.280116e+10 | 2023-10-27 | 2245000.0 | ... | NaN | NaN | 2.018629e+09 | -34.04770 | 151.11682 | -34.04781 | 151.11606 | 10 | 2023 | POINT (-34.0477 151.11682) |
5 rows × 65 columns
df.describe()
| postcode | sa1code | eventprice | firstadvertisedeventprice | lastadvertisedeventprice | finalresulteventprice | areasize | bedrooms | baths | parking | ... | hasparkview | hasriverview | hasmountainview | advertlinkexternalid | property_latitude | property_longitude | street_centroid_latitude | street_centroid_longitude | month | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.140979e+06 | 1.019459e+06 | 1.111823e+06 | 5.429270e+05 | 6.253170e+05 | 1.128291e+06 | 1.108638e+06 | 927430.000000 | 888670.000000 | 887961.000000 | ... | 0.0 | 0.0 | 0.0 | 9.619840e+05 | 1.111457e+06 | 1.111457e+06 | 1.133509e+06 | 1.133509e+06 | 1.140979e+06 | 1.140979e+06 |
| mean | 2.321050e+03 | 1.157394e+10 | 1.131238e+06 | 9.104721e+05 | 8.995034e+05 | 1.127100e+06 | 5.609188e+03 | 3.175801 | 1.757097 | 1.733511 | ... | NaN | NaN | NaN | 1.904397e+09 | -3.348589e+01 | 1.509859e+02 | -3.349004e+01 | 1.509864e+02 | 6.562094e+00 | 2.021192e+03 |
| std | 2.412932e+02 | 7.880421e+08 | 2.178570e+06 | 5.274477e+06 | 3.695883e+06 | 1.823696e+06 | 1.495246e+06 | 1.133642 | 0.771326 | 1.093473 | ... | NaN | NaN | NaN | 4.607707e+08 | 1.338444e+00 | 1.216910e+00 | 1.331924e+00 | 1.209980e+00 | 3.400298e+00 | 1.490047e+00 |
| min | 2.000000e+03 | 1.010210e+10 | 0.000000e+00 | 1.000000e+01 | 1.000000e+01 | 0.000000e+00 | 1.000000e+00 | 1.000000 | 1.000000 | 0.000000 | ... | NaN | NaN | NaN | 8.312490e+05 | -3.725666e+01 | 1.412155e+02 | -3.725511e+01 | 1.411078e+02 | 1.000000e+00 | 2.019000e+03 |
| 25% | 2.130000e+03 | 1.090312e+10 | 5.600000e+05 | 5.236880e+05 | 5.290000e+05 | 5.590000e+05 | 4.430000e+02 | 2.000000 | 1.000000 | 1.000000 | ... | NaN | NaN | NaN | 2.015695e+09 | -3.393300e+01 | 1.508719e+02 | -3.393406e+01 | 1.508692e+02 | 3.000000e+00 | 2.020000e+03 |
| 50% | 2.251000e+03 | 1.170313e+10 | 7.911000e+05 | 7.000000e+05 | 7.100000e+05 | 7.900000e+05 | 6.640000e+02 | 3.000000 | 2.000000 | 2.000000 | ... | NaN | NaN | NaN | 2.016988e+09 | -3.381163e+01 | 1.510985e+02 | -3.381207e+01 | 1.510980e+02 | 7.000000e+00 | 2.021000e+03 |
| 75% | 2.527000e+03 | 1.220214e+10 | 1.240000e+06 | 9.500000e+05 | 9.600000e+05 | 1.245000e+06 | 1.041000e+03 | 4.000000 | 2.000000 | 2.000000 | ... | NaN | NaN | NaN | 2.018063e+09 | -3.343032e+01 | 1.512698e+02 | -3.343280e+01 | 1.512680e+02 | 1.000000e+01 | 2.022000e+03 |
| max | 4.383000e+03 | 1.280216e+10 | 1.012500e+09 | 2.147484e+09 | 1.900000e+09 | 1.723654e+08 | 1.304260e+09 | 32.000000 | 20.000000 | 70.000000 | ... | NaN | NaN | NaN | 2.019327e+09 | -2.816608e+01 | 1.536366e+02 | -2.816663e+01 | 1.536349e+02 | 1.200000e+01 | 2.024000e+03 |
8 rows × 30 columns
# List of columns with no values
columns_with_no_values = [
'hasbilliardroom', 'hassauna', 'hasbeenrenovated',
'haswaterview', 'hasharbourview', 'hasoceanview',
'hascityview', 'hasbushview', 'hasdistrictview',
'hasbayview', 'hasparkview', 'hasriverview',
'hasmountainview'
]
# Drop columns with no values
df.drop(columns=columns_with_no_values, inplace=True)
# List of columns with very high missing values
columns_with_high_missing_values = [
'hasstudy', 'hasseparatedining', 'hasfamilyroom',
'hassunroom', 'hasrumpusroom', 'hasfireplace',
'haswalkinwardrobe', 'hascourtyard', 'hasinternallaundry',
'hasheating', 'hasairconditioning', 'hasbalcony',
'hasbarbeque', 'haspolishedtimberfloor', 'hasensuite',
'hasspa', 'hasgarage', 'haslockupgarage',
'haspool', 'hastenniscourt', 'hasalarm'
]
# Replace missing values with 0
df[columns_with_high_missing_values] = df[columns_with_high_missing_values].fillna(False)
# Save the cleaned DataFrame to a new CSV file
df.to_csv('cleaned_data.csv', index=False)
# List all columns and the number of missing values in each column
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
# Display the columns with their number of missing values
print("Columns with missing values and the number of missing values:")
print(missing_values)
Columns with missing values and the number of missing values: streetname 94 streettype 11002 sa1code 121520 eventprice 29156 firstadvertisedeventdate 255551 firstadvertisedeventprice 598052 lastadvertisedeventdate 255551 lastadvertisedeventprice 515662 finalresulteventdate 35 finalresulteventprice 12688 finalresulteventtypecode 72 areasize 32341 bedrooms 213549 baths 252309 parking 253018 advertlinkexternalid 178995 property_latitude 29522 property_longitude 29522 street_centroid_latitude 7470 street_centroid_longitude 7470 the_geom 29522 dtype: int64
# Calculate the threshold for 70% missing data
threshold = 0.7 * len(df)
# Filter columns with missing values exceeding the threshold
df = df.loc[:, df.isnull().sum() < threshold]
# Save the cleaned DataFrame
df.to_csv('cleaned_data.csv', index=False) # Save the cleaned DataFrame
# List all columns and the number of missing values in each column
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
# Display the columns with their number of missing values
print("Columns with missing values and the number of missing values:")
print(missing_values)
Columns with missing values and the number of missing values: streetname 94 streettype 11002 sa1code 121520 eventprice 29156 firstadvertisedeventdate 255551 firstadvertisedeventprice 598052 lastadvertisedeventdate 255551 lastadvertisedeventprice 515662 finalresulteventdate 35 finalresulteventprice 12688 finalresulteventtypecode 72 areasize 32341 bedrooms 213549 baths 252309 parking 253018 advertlinkexternalid 178995 property_latitude 29522 property_longitude 29522 street_centroid_latitude 7470 street_centroid_longitude 7470 the_geom 29522 dtype: int64
df.head()
| FID | propertycategorisation | streetname | streettype | suburb | postcode | state | sa1code | eventdate | eventprice | ... | hastenniscourt | hasalarm | advertlinkexternalid | property_latitude | property_longitude | street_centroid_latitude | street_centroid_longitude | month | year | the_geom | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | apm_point_sold_nsw.374101 | House | Federation | Dr | Medowie | 2318 | NSW | 1.060311e+10 | 2021-05-28 | 950000.0 | ... | False | False | 2.016998e+09 | -32.73393 | 151.87674 | -32.73197 | 151.87641 | 5 | 2021 | POINT (-32.73393 151.87674) |
| 1 | apm_point_sold_nsw.374102 | House | Koel | Pl | Woronora Heights | 2233 | NSW | 1.280216e+10 | 2019-05-08 | 1050100.0 | ... | False | False | NaN | -34.03078 | 151.02859 | -34.03108 | 151.02864 | 5 | 2019 | POINT (-34.03078 151.02859) |
| 2 | apm_point_sold_nsw.374103 | House | Stanley | St | Mona Vale | 2103 | NSW | 1.220214e+10 | 2020-08-21 | 3000000.0 | ... | False | False | 2.016396e+09 | -33.67632 | 151.31369 | -33.67584 | 151.31403 | 8 | 2020 | POINT (-33.67632 151.31369) |
| 3 | apm_point_sold_nsw.374104 | House | Peter | Cr | Greenacre | 2190 | NSW | 1.190114e+10 | 2022-05-07 | 1330000.0 | ... | False | False | 2.017728e+09 | -33.89731 | 151.05179 | -33.89696 | 151.05268 | 5 | 2022 | POINT (-33.89731 151.05179) |
| 4 | apm_point_sold_nsw.374105 | House | Babbin | Pl | Caringbah South | 2229 | NSW | 1.280116e+10 | 2023-10-27 | 2245000.0 | ... | False | True | 2.018629e+09 | -34.04770 | 151.11682 | -34.04781 | 151.11606 | 10 | 2023 | POINT (-34.0477 151.11682) |
5 rows × 52 columns
# Function to update the_geom based on property_latitude and property_longitude or vice versa
def update_geom(row):
# If the_geom is missing but latitude and longitude are present
if pd.isnull(row['the_geom']) and pd.notnull(row['property_latitude']) and pd.notnull(row['property_longitude']):
return f"POINT ({row['property_longitude']} {row['property_latitude']})"
# If latitude and longitude are missing but the_geom has value
elif pd.notnull(row['the_geom']):
# Extract latitude and longitude from the_geom
match = re.match(r"POINT \(([-\d.]+) ([-\d.]+)\)", row['the_geom'])
if match:
row['property_longitude'], row['property_latitude'] = map(float, match.groups())
return row['the_geom']
else:
return None
# Apply the function to update 'the_geom' column and extract latitude and longitude if missing
df['the_geom'] = df.apply(update_geom, axis=1)
# Drop rows where both the_geom and latitude/longitude are missing
df = df.dropna(subset=['the_geom', 'property_latitude', 'property_longitude'], how='all')
# Display the updated DataFrame
df.head()
| FID | propertycategorisation | streetname | streettype | suburb | postcode | state | sa1code | eventdate | eventprice | ... | hastenniscourt | hasalarm | advertlinkexternalid | property_latitude | property_longitude | street_centroid_latitude | street_centroid_longitude | month | year | the_geom | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | apm_point_sold_nsw.374101 | House | Federation | Dr | Medowie | 2318 | NSW | 1.060311e+10 | 2021-05-28 | 950000.0 | ... | False | False | 2.016998e+09 | -32.73393 | 151.87674 | -32.73197 | 151.87641 | 5 | 2021 | POINT (-32.73393 151.87674) |
| 1 | apm_point_sold_nsw.374102 | House | Koel | Pl | Woronora Heights | 2233 | NSW | 1.280216e+10 | 2019-05-08 | 1050100.0 | ... | False | False | NaN | -34.03078 | 151.02859 | -34.03108 | 151.02864 | 5 | 2019 | POINT (-34.03078 151.02859) |
| 2 | apm_point_sold_nsw.374103 | House | Stanley | St | Mona Vale | 2103 | NSW | 1.220214e+10 | 2020-08-21 | 3000000.0 | ... | False | False | 2.016396e+09 | -33.67632 | 151.31369 | -33.67584 | 151.31403 | 8 | 2020 | POINT (-33.67632 151.31369) |
| 3 | apm_point_sold_nsw.374104 | House | Peter | Cr | Greenacre | 2190 | NSW | 1.190114e+10 | 2022-05-07 | 1330000.0 | ... | False | False | 2.017728e+09 | -33.89731 | 151.05179 | -33.89696 | 151.05268 | 5 | 2022 | POINT (-33.89731 151.05179) |
| 4 | apm_point_sold_nsw.374105 | House | Babbin | Pl | Caringbah South | 2229 | NSW | 1.280116e+10 | 2023-10-27 | 2245000.0 | ... | False | True | 2.018629e+09 | -34.04770 | 151.11682 | -34.04781 | 151.11606 | 10 | 2023 | POINT (-34.0477 151.11682) |
5 rows × 52 columns
# List all columns and the number of missing values in each column
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
# Display the columns with their number of missing values
print("Columns with missing values and the number of missing values:")
print(missing_values)
Columns with missing values and the number of missing values: streetname 88 streettype 9811 sa1code 92091 eventprice 18741 firstadvertisedeventdate 246159 firstadvertisedeventprice 585357 lastadvertisedeventdate 246159 lastadvertisedeventprice 503555 finalresulteventdate 32 finalresulteventprice 2424 finalresulteventtypecode 69 areasize 23010 bedrooms 207325 baths 243932 parking 244650 advertlinkexternalid 169884 street_centroid_latitude 81 street_centroid_longitude 81 dtype: int64
# Replace missing 'eventprice' values with 'lastadvertisedeventprice' where available
df['eventprice'] = df['eventprice'].fillna(df['lastadvertisedeventprice'])
# Drop rows where 'eventprice' is missing
df = df.dropna(subset=['eventprice'])
# Optional: Reset index after dropping rows
df.reset_index(drop=True, inplace=True)
# List all columns and the number of missing values in each column
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
# Display the columns with their number of missing values
print("Columns with missing values and the number of missing values:")
print(missing_values)
Columns with missing values and the number of missing values: streetname 88 streettype 9706 sa1code 91754 firstadvertisedeventdate 245190 firstadvertisedeventprice 575093 lastadvertisedeventdate 245190 lastadvertisedeventprice 491201 finalresulteventdate 32 finalresulteventprice 1045 finalresulteventtypecode 68 areasize 22897 bedrooms 207279 baths 243202 parking 243921 advertlinkexternalid 169427 street_centroid_latitude 81 street_centroid_longitude 81 dtype: int64
# Replace missing 'bedrooms' and 'baths' with 0 where 'propertytype' is 'Studio'
df.loc[(df['propertytype'] == 'Studio') & (df['bedrooms'].isnull()), 'bedrooms'] = 0
df.loc[(df['propertytype'] == 'Studio') & (df['baths'].isnull()), 'baths'] = 1
# List all columns and the number of missing values in each column
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
# Display the columns with their number of missing values
print("Columns with missing values and the number of missing values:")
print(missing_values)
Columns with missing values and the number of missing values: streetname 88 streettype 9706 sa1code 91754 firstadvertisedeventdate 245190 firstadvertisedeventprice 575093 lastadvertisedeventdate 245190 lastadvertisedeventprice 491201 finalresulteventdate 32 finalresulteventprice 1045 finalresulteventtypecode 68 areasize 22897 bedrooms 205372 baths 242719 parking 243921 advertlinkexternalid 169427 street_centroid_latitude 81 street_centroid_longitude 81 dtype: int64
# Fill missing values for categorical columns
df['streetname'].fillna('Unknown', inplace=True)
df['streettype'].fillna('Unknown', inplace=True)
df['finalresulteventtypecode'].fillna('Unknown', inplace=True)
# Fill missing values for numerical columns with median or mean
df['sa1code'].fillna(df['sa1code'].median(), inplace=True)
df['firstadvertisedeventdate'].fillna(df['firstadvertisedeventdate'].mode()[0], inplace=True)
df['firstadvertisedeventprice'].fillna(df['firstadvertisedeventprice'].median(), inplace=True)
df['lastadvertisedeventdate'].fillna(df['lastadvertisedeventdate'].mode()[0], inplace=True)
df['lastadvertisedeventprice'].fillna(df['lastadvertisedeventprice'].median(), inplace=True)
df['finalresulteventdate'].fillna(df['finalresulteventdate'].mode()[0], inplace=True)
df['finalresulteventprice'].fillna(df['finalresulteventprice'].median(), inplace=True)
df['areasize'].fillna(df['areasize'].median(), inplace=True)
df['bedrooms'].fillna(0, inplace=True)
df['baths'].fillna(0, inplace=True)
df['parking'].fillna(0, inplace=True)
df['advertlinkexternalid'].fillna(df['advertlinkexternalid'].mode()[0], inplace=True)
df['street_centroid_latitude'].fillna(df['street_centroid_latitude'].median(), inplace=True)
df['street_centroid_longitude'].fillna(df['street_centroid_longitude'].median(), inplace=True)
# Drop columns with too many missing values
columns_to_drop = ['firstadvertisedeventdate', 'lastadvertisedeventdate']
df.drop(columns=columns_to_drop, inplace=True)
# Drop rows with critical missing values if necessary
df.dropna(subset=['eventprice'], inplace=True)
# Verify if there are any remaining missing values
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])
Series([], dtype: int64)
missing_values
FID 0 propertycategorisation 0 streetname 0 streettype 0 suburb 0 postcode 0 state 0 sa1code 0 eventdate 0 eventprice 0 eventtypecode 0 firstadvertisedeventprice 0 lastadvertisedeventprice 0 finalresulteventdate 0 finalresulteventprice 0 finalresulteventtypecode 0 propertytype 0 areasize 0 bedrooms 0 baths 0 parking 0 hasstudy 0 hasseparatedining 0 hasfamilyroom 0 hassunroom 0 hasrumpusroom 0 hasfireplace 0 haswalkinwardrobe 0 hascourtyard 0 hasinternallaundry 0 hasheating 0 hasairconditioning 0 hasbalcony 0 hasbarbeque 0 haspolishedtimberfloor 0 hasensuite 0 hasspa 0 hasgarage 0 haslockupgarage 0 haspool 0 hastenniscourt 0 hasalarm 0 advertlinkexternalid 0 property_latitude 0 property_longitude 0 street_centroid_latitude 0 street_centroid_longitude 0 month 0 year 0 the_geom 0 dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1099103 entries, 0 to 1099102 Data columns (total 50 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 FID 1099103 non-null object 1 propertycategorisation 1099103 non-null object 2 streetname 1099103 non-null object 3 streettype 1099103 non-null object 4 suburb 1099103 non-null object 5 postcode 1099103 non-null int64 6 state 1099103 non-null object 7 sa1code 1099103 non-null float64 8 eventdate 1099103 non-null object 9 eventprice 1099103 non-null float64 10 eventtypecode 1099103 non-null object 11 firstadvertisedeventprice 1099103 non-null float64 12 lastadvertisedeventprice 1099103 non-null float64 13 finalresulteventdate 1099103 non-null object 14 finalresulteventprice 1099103 non-null float64 15 finalresulteventtypecode 1099103 non-null object 16 propertytype 1099103 non-null object 17 areasize 1099103 non-null float64 18 bedrooms 1099103 non-null float64 19 baths 1099103 non-null float64 20 parking 1099103 non-null float64 21 hasstudy 1099103 non-null bool 22 hasseparatedining 1099103 non-null bool 23 hasfamilyroom 1099103 non-null bool 24 hassunroom 1099103 non-null bool 25 hasrumpusroom 1099103 non-null bool 26 hasfireplace 1099103 non-null bool 27 haswalkinwardrobe 1099103 non-null bool 28 hascourtyard 1099103 non-null bool 29 hasinternallaundry 1099103 non-null bool 30 hasheating 1099103 non-null bool 31 hasairconditioning 1099103 non-null bool 32 hasbalcony 1099103 non-null bool 33 hasbarbeque 1099103 non-null bool 34 haspolishedtimberfloor 1099103 non-null bool 35 hasensuite 1099103 non-null bool 36 hasspa 1099103 non-null bool 37 hasgarage 1099103 non-null bool 38 haslockupgarage 1099103 non-null bool 39 haspool 1099103 non-null bool 40 hastenniscourt 1099103 non-null bool 41 hasalarm 1099103 non-null bool 42 advertlinkexternalid 1099103 non-null float64 43 property_latitude 1099103 non-null float64 44 property_longitude 1099103 non-null float64 45 street_centroid_latitude 1099103 non-null float64 46 street_centroid_longitude 1099103 non-null float64 47 month 1099103 non-null int64 48 year 1099103 non-null int64 49 the_geom 1099103 non-null object dtypes: bool(21), float64(14), int64(3), object(12) memory usage: 265.2+ MB
the_geom Column¶the_geom column based on available latitude and longitude values.the_geom when necessary to ensure accurate geospatial information.the_geom and latitude/longitude values were missing.eventprice was missing but lastadvertisedeventprice was available, replaced eventprice with lastadvertisedeventprice.eventprice values to focus on complete data.propertytype was 'Studio' and both bedrooms and baths were missing.bedrooms with 0 and baths with 1 to ensure all properties had complete information.streetname, streettype, sa1code, firstadvertisedeventdate, firstadvertisedeventprice, lastadvertisedeventdate, lastadvertisedeventprice, finalresulteventdate, finalresulteventprice, finalresulteventtypecode, areasize, bedrooms, baths, parking, advertlinkexternalid, street_centroid_latitude, and street_centroid_longitude.This detailed markdown outlines the comprehensive steps taken to clean and prepare the dataset, ensuring it is ready for meaningful analysis.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1099103 entries, 0 to 1099102 Data columns (total 50 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 FID 1099103 non-null object 1 propertycategorisation 1099103 non-null object 2 streetname 1099103 non-null object 3 streettype 1099103 non-null object 4 suburb 1099103 non-null object 5 postcode 1099103 non-null int64 6 state 1099103 non-null object 7 sa1code 1099103 non-null float64 8 eventdate 1099103 non-null object 9 eventprice 1099103 non-null float64 10 eventtypecode 1099103 non-null object 11 firstadvertisedeventprice 1099103 non-null float64 12 lastadvertisedeventprice 1099103 non-null float64 13 finalresulteventdate 1099103 non-null object 14 finalresulteventprice 1099103 non-null float64 15 finalresulteventtypecode 1099103 non-null object 16 propertytype 1099103 non-null object 17 areasize 1099103 non-null float64 18 bedrooms 1099103 non-null float64 19 baths 1099103 non-null float64 20 parking 1099103 non-null float64 21 hasstudy 1099103 non-null bool 22 hasseparatedining 1099103 non-null bool 23 hasfamilyroom 1099103 non-null bool 24 hassunroom 1099103 non-null bool 25 hasrumpusroom 1099103 non-null bool 26 hasfireplace 1099103 non-null bool 27 haswalkinwardrobe 1099103 non-null bool 28 hascourtyard 1099103 non-null bool 29 hasinternallaundry 1099103 non-null bool 30 hasheating 1099103 non-null bool 31 hasairconditioning 1099103 non-null bool 32 hasbalcony 1099103 non-null bool 33 hasbarbeque 1099103 non-null bool 34 haspolishedtimberfloor 1099103 non-null bool 35 hasensuite 1099103 non-null bool 36 hasspa 1099103 non-null bool 37 hasgarage 1099103 non-null bool 38 haslockupgarage 1099103 non-null bool 39 haspool 1099103 non-null bool 40 hastenniscourt 1099103 non-null bool 41 hasalarm 1099103 non-null bool 42 advertlinkexternalid 1099103 non-null float64 43 property_latitude 1099103 non-null float64 44 property_longitude 1099103 non-null float64 45 street_centroid_latitude 1099103 non-null float64 46 street_centroid_longitude 1099103 non-null float64 47 month 1099103 non-null int64 48 year 1099103 non-null int64 49 the_geom 1099103 non-null object dtypes: bool(21), float64(14), int64(3), object(12) memory usage: 265.2+ MB
# Display basic information about the dataset
print("Dataset Information:")
print(df.info())
Dataset Information: <class 'pandas.core.frame.DataFrame'> RangeIndex: 1099103 entries, 0 to 1099102 Data columns (total 50 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 FID 1099103 non-null object 1 propertycategorisation 1099103 non-null object 2 streetname 1099103 non-null object 3 streettype 1099103 non-null object 4 suburb 1099103 non-null object 5 postcode 1099103 non-null int64 6 state 1099103 non-null object 7 sa1code 1099103 non-null float64 8 eventdate 1099103 non-null object 9 eventprice 1099103 non-null float64 10 eventtypecode 1099103 non-null object 11 firstadvertisedeventprice 1099103 non-null float64 12 lastadvertisedeventprice 1099103 non-null float64 13 finalresulteventdate 1099103 non-null object 14 finalresulteventprice 1099103 non-null float64 15 finalresulteventtypecode 1099103 non-null object 16 propertytype 1099103 non-null object 17 areasize 1099103 non-null float64 18 bedrooms 1099103 non-null float64 19 baths 1099103 non-null float64 20 parking 1099103 non-null float64 21 hasstudy 1099103 non-null bool 22 hasseparatedining 1099103 non-null bool 23 hasfamilyroom 1099103 non-null bool 24 hassunroom 1099103 non-null bool 25 hasrumpusroom 1099103 non-null bool 26 hasfireplace 1099103 non-null bool 27 haswalkinwardrobe 1099103 non-null bool 28 hascourtyard 1099103 non-null bool 29 hasinternallaundry 1099103 non-null bool 30 hasheating 1099103 non-null bool 31 hasairconditioning 1099103 non-null bool 32 hasbalcony 1099103 non-null bool 33 hasbarbeque 1099103 non-null bool 34 haspolishedtimberfloor 1099103 non-null bool 35 hasensuite 1099103 non-null bool 36 hasspa 1099103 non-null bool 37 hasgarage 1099103 non-null bool 38 haslockupgarage 1099103 non-null bool 39 haspool 1099103 non-null bool 40 hastenniscourt 1099103 non-null bool 41 hasalarm 1099103 non-null bool 42 advertlinkexternalid 1099103 non-null float64 43 property_latitude 1099103 non-null float64 44 property_longitude 1099103 non-null float64 45 street_centroid_latitude 1099103 non-null float64 46 street_centroid_longitude 1099103 non-null float64 47 month 1099103 non-null int64 48 year 1099103 non-null int64 49 the_geom 1099103 non-null object dtypes: bool(21), float64(14), int64(3), object(12) memory usage: 265.2+ MB None
# Display basic statistics of numerical columns
print("\nDescriptive Statistics:")
print(df.describe())
Descriptive Statistics:
postcode sa1code eventprice firstadvertisedeventprice \
count 1.099103e+06 1.099103e+06 1.099103e+06 1.099103e+06
mean 2.320733e+03 1.158170e+10 1.130743e+06 7.991999e+05
std 2.404179e+02 7.563350e+08 2.179178e+06 3.706979e+06
min 2.000000e+03 1.010210e+10 0.000000e+00 1.000000e+01
25% 2.130000e+03 1.100412e+10 5.600000e+05 7.000000e+05
50% 2.251000e+03 1.170313e+10 7.930160e+05 7.000000e+05
75% 2.526000e+03 1.210414e+10 1.240000e+06 7.000000e+05
max 4.383000e+03 1.280216e+10 1.012500e+09 2.147484e+09
lastadvertisedeventprice finalresulteventprice areasize \
count 1.099103e+06 1.099103e+06 1.099103e+06
mean 8.094227e+05 1.120135e+06 5.223943e+03
std 2.631005e+06 1.824313e+06 1.455697e+06
min 1.000000e+01 0.000000e+00 1.000000e+00
25% 6.790000e+05 5.550000e+05 4.550000e+02
50% 7.000000e+05 7.900000e+05 6.700000e+02
75% 7.500000e+05 1.230000e+06 1.037000e+03
max 1.900000e+09 1.723654e+08 1.304260e+09
bedrooms baths parking advertlinkexternalid \
count 1.099103e+06 1.099103e+06 1.099103e+06 1.099103e+06
mean 2.567820e+00 1.361235e+00 1.347388e+00 1.919695e+09
std 1.601678e+00 9.931984e-01 1.205593e+00 4.291830e+08
min 0.000000e+00 0.000000e+00 0.000000e+00 8.312490e+05
25% 2.000000e+00 1.000000e+00 0.000000e+00 2.014400e+09
50% 3.000000e+00 1.000000e+00 1.000000e+00 2.016608e+09
75% 4.000000e+00 2.000000e+00 2.000000e+00 2.017870e+09
max 3.200000e+01 2.000000e+01 7.000000e+01 2.019327e+09
property_latitude property_longitude street_centroid_latitude \
count 1.099103e+06 1.099103e+06 1.099103e+06
mean -3.348315e+01 1.509860e+02 -3.348316e+01
std 1.343293e+00 1.219169e+00 1.343230e+00
min -3.725666e+01 1.412155e+02 -3.725511e+01
25% -3.393365e+01 1.508703e+02 -3.393376e+01
50% -3.381114e+01 1.510978e+02 -3.381134e+01
75% -3.342588e+01 1.512714e+02 -3.342584e+01
max -2.816608e+01 1.536366e+02 -2.816663e+01
street_centroid_longitude month year
count 1.099103e+06 1.099103e+06 1.099103e+06
mean 1.509860e+02 6.567195e+00 2.021209e+03
std 1.219083e+00 3.404034e+00 1.486901e+00
min 1.411078e+02 1.000000e+00 2.019000e+03
25% 1.508701e+02 3.000000e+00 2.020000e+03
50% 1.510985e+02 7.000000e+00 2.021000e+03
75% 1.512714e+02 1.000000e+01 2.022000e+03
max 1.536349e+02 1.200000e+01 2.024000e+03
# Display the first few rows of the dataset
print("\nFirst Few Rows:")
print(df.head())
First Few Rows:
FID propertycategorisation streetname streettype \
0 apm_point_sold_nsw.374101 House Federation Dr
1 apm_point_sold_nsw.374102 House Koel Pl
2 apm_point_sold_nsw.374103 House Stanley St
3 apm_point_sold_nsw.374104 House Peter Cr
4 apm_point_sold_nsw.374105 House Babbin Pl
suburb postcode state sa1code eventdate eventprice \
0 Medowie 2318 NSW 1.060311e+10 2021-05-28 950000.0
1 Woronora Heights 2233 NSW 1.280216e+10 2019-05-08 1050100.0
2 Mona Vale 2103 NSW 1.220214e+10 2020-08-21 3000000.0
3 Greenacre 2190 NSW 1.190114e+10 2022-05-07 1330000.0
4 Caringbah South 2229 NSW 1.280116e+10 2023-10-27 2245000.0
... hastenniscourt hasalarm advertlinkexternalid property_latitude \
0 ... False False 2.016998e+09 -32.73393
1 ... False False 2.014400e+09 -34.03078
2 ... False False 2.016396e+09 -33.67632
3 ... False False 2.017728e+09 -33.89731
4 ... False True 2.018629e+09 -34.04770
property_longitude street_centroid_latitude street_centroid_longitude \
0 151.87674 -32.73197 151.87641
1 151.02859 -34.03108 151.02864
2 151.31369 -33.67584 151.31403
3 151.05179 -33.89696 151.05268
4 151.11682 -34.04781 151.11606
month year the_geom
0 5 2021 POINT (-32.73393 151.87674)
1 5 2019 POINT (-34.03078 151.02859)
2 8 2020 POINT (-33.67632 151.31369)
3 5 2022 POINT (-33.89731 151.05179)
4 10 2023 POINT (-34.0477 151.11682)
[5 rows x 50 columns]
# Check for duplicate rows
print("\nNumber of Duplicate Rows:")
print(df.duplicated().sum())
Number of Duplicate Rows: 0
# Distribution of categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
print(f"\n{col} Distribution:")
print(df[col].value_counts())
FID Distribution:
apm_point_sold_nsw.374101 1
apm_point_sold_nsw.1135957 1
apm_point_sold_nsw.1135943 1
apm_point_sold_nsw.1135944 1
apm_point_sold_nsw.1135945 1
..
apm_point_sold_nsw.754959 1
apm_point_sold_nsw.754960 1
apm_point_sold_nsw.754961 1
apm_point_sold_nsw.754962 1
apm_point_sold_nsw.374100 1
Name: FID, Length: 1099103, dtype: int64
propertycategorisation Distribution:
House 751553
Unit 347550
Name: propertycategorisation, dtype: int64
streetname Distribution:
Pacific 7580
Victoria 5347
George 4268
Park 4023
Railway 3539
...
Marabunga 1
Charkers 1
Thomas Hennessy 1
Barrys 1
Haverty 1
Name: streetname, Length: 26704, dtype: int64
streettype Distribution:
St 443298
Rd 197762
Av 138892
Dr 65557
Pl 49810
...
Piaz 1
Fitr 1
Est 1
Paradise 1
Cr N 1
Name: streettype, Length: 137, dtype: int64
suburb Distribution:
Port Macquarie 8274
Blacktown 7133
Orange 5766
Ryde 5671
Parramatta 5599
...
Burrier 1
Hannam Vale 1
Berkshire Park 1
Bendoura 1
Wombeyan Caves 1
Name: suburb, Length: 2468, dtype: int64
state Distribution:
NSW 1099103
Name: state, dtype: int64
eventdate Distribution:
2019-12-20 1594
2021-03-27 1580
2021-03-31 1437
2020-12-18 1377
2021-04-01 1362
...
2019-12-29 12
2023-12-26 11
2019-12-25 11
2021-12-26 9
2022-12-25 8
Name: eventdate, Length: 1947, dtype: int64
eventtypecode Distribution:
PTSD 916287
AUSD 69003
AUSP 61541
PTSW 42514
AUSA 7455
AUPN 1476
AUSN 482
AUSS 345
Name: eventtypecode, dtype: int64
finalresulteventdate Distribution:
2019-12-20 1584
2021-04-01 1466
2021-03-31 1417
2020-12-23 1402
2021-12-03 1332
...
2016-09-05 1
2018-02-16 1
2018-02-14 1
2018-10-05 1
2018-11-03 1
Name: finalresulteventdate, Length: 2147, dtype: int64
finalresulteventtypecode Distribution:
PTSD 983688
PTSW 46438
AUSD 45562
PTLA 12720
AUSP 7609
AUSA 2507
AUPN 293
AUSN 181
Unknown 68
AUSS 37
Name: finalresulteventtypecode, dtype: int64
propertytype Distribution:
House 660421
Unit 336467
Townhouse 75537
Duplex 8437
Semi 7523
Villa 4122
Studio 2601
Cottage 2079
Terrace 1871
Flat 43
Triplex 2
Name: propertytype, dtype: int64
the_geom Distribution:
POINT (-33.82402 151.19541) 558
POINT (-33.85922 151.06267) 556
POINT (-33.9158 151.03215) 436
POINT (-33.77875 151.12151) 405
POINT (-33.69337 150.92687) 388
...
POINT (-36.20432 150.11663) 1
POINT (-34.64524 150.84032) 1
POINT (-33.9281 151.23094) 1
POINT (-34.85702 148.91948) 1
POINT (-33.87655 151.16296) 1
Name: the_geom, Length: 514608, dtype: int64
# Reset index to ensure alignment
df.reset_index(drop=True, inplace=True)
# Calculate Z-scores for numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df_numerical = df[numerical_cols]
z_scores = pd.DataFrame(stats.zscore(df_numerical, nan_policy='omit'), columns=numerical_cols)
# Ensure the Z-scores DataFrame has the same index as the original DataFrame
z_scores.index = df_numerical.index
# Define a threshold for identifying outliers
threshold = 3
# Identify outliers (convert to boolean)
outliers = (z_scores.abs() > threshold).any(axis=1)
outliers = outliers.astype(bool) # Ensure outliers is a boolean Series
# Align the boolean Series with the original DataFrame
outliers_indexed = pd.Series(outliers, index=df.index)
# Remove outliers from the DataFrame
df_no_outliers = df[~outliers_indexed]
# Save the DataFrame without outliers
df_no_outliers.to_csv('cleaned_data_no_outliers.csv', index=False)
df = df_no_outliers
# Display unique values and counts in 'propertytype'
propertytype_counts = df['propertytype'].value_counts()
print("Unique Values in 'propertytype':")
print(propertytype_counts)
# Define a threshold for rare property types
# For example, consider property types with fewer than 1000 occurrences as outliers
threshold = 1000
# Identify rare property types
rare_property_types = propertytype_counts[propertytype_counts < threshold].index
# Remove rows with rare property types
df_cleaned_propertytype = df[~df['propertytype'].isin(rare_property_types)]
# Save the cleaned DataFrame
df_cleaned_propertytype.to_csv('cleaned_data_no_rare_propertytypes.csv', index=False)
print("Rows with rare property types have been removed.")
Unique Values in 'propertytype': House 569371 Unit 295999 Townhouse 67066 Semi 6881 Duplex 6579 Villa 3716 Studio 2249 Cottage 1903 Terrace 1755 Flat 33 Triplex 2 Name: propertytype, dtype: int64 Rows with rare property types have been removed.
pip install folium
Collecting folium
Downloading folium-0.17.0-py2.py3-none-any.whl (108 kB)
-------------------------------------- 108.4/108.4 kB 6.1 MB/s eta 0:00:00
Requirement already satisfied: requests in c:\users\dell\anaconda3\lib\site-packages (from folium) (2.28.1)
Requirement already satisfied: jinja2>=2.9 in c:\users\dell\anaconda3\lib\site-packages (from folium) (3.1.2)
Requirement already satisfied: numpy in c:\users\dell\anaconda3\lib\site-packages (from folium) (1.23.5)
Collecting branca>=0.6.0
Downloading branca-0.7.2-py3-none-any.whl (25 kB)
Collecting xyzservices
Downloading xyzservices-2024.6.0-py3-none-any.whl (83 kB)
---------------------------------------- 83.9/83.9 kB 4.6 MB/s eta 0:00:00
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\dell\anaconda3\lib\site-packages (from jinja2>=2.9->folium) (2.1.1)
Requirement already satisfied: idna<4,>=2.5 in c:\users\dell\anaconda3\lib\site-packages (from requests->folium) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\dell\anaconda3\lib\site-packages (from requests->folium) (2.0.4)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\dell\anaconda3\lib\site-packages (from requests->folium) (2023.5.7)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\dell\anaconda3\lib\site-packages (from requests->folium) (1.26.14)
Installing collected packages: xyzservices, branca, folium
Successfully installed branca-0.7.2 folium-0.17.0 xyzservices-2024.6.0
Note: you may need to restart the kernel to use updated packages.
import folium
from folium.plugins import HeatMap
# Ensure longitude and latitude columns are present
if 'property_longitude' in df.columns and 'property_latitude' in df.columns:
# Create a base map centered around the average latitude and longitude
center_lat = df['property_latitude'].mean()
center_lon = df['property_longitude'].mean()
base_map = folium.Map(location=[center_lat, center_lon], zoom_start=12)
# Add a heatmap layer
heat_data = [[row['property_latitude'], row['property_longitude']] for index, row in df.iterrows() if pd.notnull(row['property_latitude']) and pd.notnull(row['property_longitude'])]
HeatMap(heat_data).add_to(base_map)
# Save the map to an HTML file
base_map.save('property_distribution_map.html')
print("Map has been saved to 'property_distribution_map.html'.")
else:
print("Longitude and/or latitude columns are missing from the dataset.")
Map has been saved to 'property_distribution_map.html'.
display(base_map)